<!DOCTYPE html><html lang="zh-CN" data-theme="light"><head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1"><title>读懂西瓜书 16 : 强化学习 | 云玩家</title><meta name="keywords" content="机器学习,强化学习,西瓜书"><meta name="author" content="云玩家"><meta name="copyright" content="云玩家"><meta name="format-detection" content="telephone=no"><meta name="theme-color" content="#ffffff"><meta name="description" content="K-摇臂赌博机探索与利用探索与利用总是矛盾的, 要使奖赏最大, 就要在探索与利用之间做好权衡. ϵ-贪心以 $\epsilon$ 的概率进行探索, 以 $1-\epsilon$ 的概率进行利用. 增量式计算令 $Q(k)$ 记录摇臂 $k$ 的平均奖赏. 若摇臂 $k$ 被尝试了 $n$ 次, 得到的奖赏是 $v_1,v_2,\dots,v_n$ , 则平均奖赏为$$Q(k)&#x3D;\frac{1}{n">
<meta property="og:type" content="article">
<meta property="og:title" content="读懂西瓜书 16 : 强化学习">
<meta property="og:url" content="http://yunist.cn/ML/watermelon_book/read/16/index.html">
<meta property="og:site_name" content="云玩家">
<meta property="og:description" content="K-摇臂赌博机探索与利用探索与利用总是矛盾的, 要使奖赏最大, 就要在探索与利用之间做好权衡. ϵ-贪心以 $\epsilon$ 的概率进行探索, 以 $1-\epsilon$ 的概率进行利用. 增量式计算令 $Q(k)$ 记录摇臂 $k$ 的平均奖赏. 若摇臂 $k$ 被尝试了 $n$ 次, 得到的奖赏是 $v_1,v_2,\dots,v_n$ , 则平均奖赏为$$Q(k)&#x3D;\frac{1}{n">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://yunist.cn/ML/watermelon_book/read/16/16.jpg">
<meta property="article:published_time" content="2020-04-25T09:06:40.000Z">
<meta property="article:modified_time" content="2021-08-14T04:39:34.341Z">
<meta property="article:author" content="云玩家">
<meta property="article:tag" content="机器学习">
<meta property="article:tag" content="强化学习">
<meta property="article:tag" content="西瓜书">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http://yunist.cn/ML/watermelon_book/read/16/16.jpg"><link rel="shortcut icon" href="/img/favicon.ico"><link rel="canonical" href="http://yunist.cn/ML/watermelon_book/read/16/"><link rel="preconnect" href="//cdn.jsdelivr.net"/><link rel="preconnect" href="//s4.cnzz.com"/><link rel="preconnect" href="//busuanzi.ibruce.info"/><link rel="stylesheet" href="/css/index.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free/css/all.min.css" media="print" onload="this.media='all'"><script async="async" data-pjax="data-pjax" src="https://s4.cnzz.com/z_stat.php?id=1278869595&amp;web_id=1278869595"></script><script>const GLOBAL_CONFIG = { 
  // Site-wide settings object exposed as the global GLOBAL_CONFIG.
  // NOTE(review): the keys set to `undefined` (algolia, translate, noticeOutdate,
  // copyright, Snackbar) look like optional features that are switched off —
  // confirm against the scripts that read GLOBAL_CONFIG.
  root: '/',
  algolia: undefined,
  // Local search: index file path plus the message shown when a query has no hits
  // (`${query}` is a literal placeholder inside the string, not a template literal).
  localSearch: {"path":"search.xml","languages":{"hits_empty":"找不到您查询的内容：${query}"}},
  translate: undefined,
  noticeOutdate: undefined,
  // NOTE(review): "highlighjs" (sic) is a runtime value; do not "correct" the spelling
  // without checking every consumer that compares against it.
  highlight: {"plugin":"highlighjs","highlightCopy":true,"highlightLang":true},
  // User-facing messages for the copy action (success / error / unsupported browser).
  copy: {
    success: '复制成功',
    error: '复制错误',
    noSupport: '浏览器不支持'
  },
  relativeDate: {
    homepage: false,
    post: false
  },
  runtime: '',
  // Suffix strings for relative timestamps (just now, minutes/hours/days/months ago).
  date_suffix: {
    just: '刚刚',
    min: '分钟前',
    hour: '小时前',
    day: '天前',
    month: '个月前'
  },
  copyright: undefined,
  lightbox: 'fancybox',
  Snackbar: undefined,
  // CDN locations of third-party assets.
  // NOTE(review): jquery@latest and fancybox@latest are unpinned, so the version served
  // can change without a redeploy — consider pinning exact versions.
  source: {
    jQuery: 'https://cdn.jsdelivr.net/npm/jquery@latest/dist/jquery.min.js',
    justifiedGallery: {
      js: 'https://cdn.jsdelivr.net/npm/justifiedGallery/dist/js/jquery.justifiedGallery.min.js',
      css: 'https://cdn.jsdelivr.net/npm/justifiedGallery/dist/css/justifiedGallery.min.css'
    },
    fancybox: {
      js: 'https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@latest/dist/jquery.fancybox.min.js',
      css: 'https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@latest/dist/jquery.fancybox.min.css'
    }
  },
  isPhotoFigcaption: false,
  islazyload: false,
  isanchor: false
}</script><script src="/js/yuntools.js"></script><script id="config-diff">var GLOBAL_CONFIG_SITE = { 
  // Per-page flags for this document: a post page (not the home page) with a TOC.
  isPost: true,
  isHome: false,
  isHighlightShrink: false,
  isToc: true,
  // Same timestamp as the "更新于" (updated-at) time shown in the post header.
  postUpdate: '2021-08-14 12:39:34'
}</script><noscript><style type="text/css">
  /* No-JavaScript fallback: this <style> lives inside <noscript>, so these rules
     apply only when scripting is disabled. They force the nav bar and gallery
     images to full opacity.
     NOTE(review): presumably these are otherwise revealed by script — confirm in
     /css/index.css. */
  #nav {
    opacity: 1
  }
  .justified-gallery img {
    opacity: 1
  }

  /* Force <time> elements in the post list and post meta to display inline. */
  #recent-posts time,
  #post-meta time {
    display: inline !important
  }
</style></noscript><script>(win=>{
    /**
     * Small localStorage wrapper that stores each value together with an
     * expiry timestamp, so stale preferences are dropped on read.
     */
    win.saveToLocal = {
      /**
       * Persist `value` under `key`.
       * @param {string} key   storage key
       * @param {*} value      any JSON-serialisable value
       * @param {number} ttl   lifetime in days; 0 means "do not store at all"
       */
      set: function setWithExpiry(key, value, ttl) {
        if (ttl === 0) return
        const now = new Date()
        const expiryDay = ttl * 86400000 // days -> milliseconds
        const item = {
          value: value,
          expiry: now.getTime() + expiryDay,
        }
        localStorage.setItem(key, JSON.stringify(item))
      },

      /**
       * Read the value stored under `key`.
       * @param {string} key   storage key
       * @returns {*} the stored value, or `undefined` when the key is missing,
       *   expired, unreadable, or not in the `{ value, expiry }` format.
       */
      get: function getWithExpiry(key) {
        let item
        try {
          const itemStr = localStorage.getItem(key)

          if (!itemStr) {
            return undefined
          }
          item = JSON.parse(itemStr)
        } catch (err) {
          // Storage may be blocked (privacy mode) or hold a non-JSON value written
          // by something else. This runs during head initialisation, so treat it as
          // "nothing stored" instead of aborting the rest of the init script.
          return undefined
        }

        // JSON such as `null` or a bare number is valid but is not our record shape;
        // reading `.expiry` on null would throw.
        if (item === null || typeof item !== 'object') {
          return undefined
        }

        const now = new Date()

        if (now.getTime() > item.expiry) {
          localStorage.removeItem(key)
          return undefined
        }
        return item.value
      }
    }
  
    /**
     * Load an external script by appending a <script async> tag to <head>.
     * @param {string} url  script URL
     * @returns {Promise<void>} resolves once the script has loaded, rejects on error
     */
    win.getScript = url => {
      return new Promise((resolve, reject) => {
        const tag = document.createElement('script')

        // Shared by onload and onreadystatechange: ignore intermediate ready
        // states, then detach both handlers before resolving.
        const handleReady = function () {
          const state = this.readyState
          const stillLoading = state && state !== 'loaded' && state !== 'complete'
          if (stillLoading) return
          tag.onload = tag.onreadystatechange = null
          resolve()
        }

        tag.src = url
        tag.async = true
        tag.onerror = reject
        tag.onload = tag.onreadystatechange = handleReady
        document.head.appendChild(tag)
      })
    }
  
      // Switch the document to the dark theme and, when a theme-color meta tag
      // exists, set it to the dark colour.
      win.activateDarkMode = function () {
        document.documentElement.setAttribute('data-theme', 'dark')
        const themeColorMeta = document.querySelector('meta[name="theme-color"]')
        if (themeColorMeta === null) return
        themeColorMeta.setAttribute('content', '#0d0d0d')
      }
      // Switch the document to the light theme and, when a theme-color meta tag
      // exists, set it to the light colour.
      win.activateLightMode = function () {
        document.documentElement.setAttribute('data-theme', 'light')
        const themeColorMeta = document.querySelector('meta[name="theme-color"]')
        if (themeColorMeta === null) return
        themeColorMeta.setAttribute('content', '#ffffff')
      }
      // Re-apply the colour scheme saved under 'theme', if any.
      const savedTheme = saveToLocal.get('theme')
      if (savedTheme === 'dark') {
        activateDarkMode()
      } else if (savedTheme === 'light') {
        activateLightMode()
      }

      // Re-apply the sidebar visibility saved under 'aside-status':
      // 'hide' adds the class, any other stored value removes it, and a missing
      // value leaves the markup untouched.
      const savedAside = saveToLocal.get('aside-status')
      const rootClasses = document.documentElement.classList
      if (savedAside === 'hide') {
        rootClasses.add('hide-aside')
      } else if (savedAside !== undefined) {
        rootClasses.remove('hide-aside')
      }
    })(window)</script><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/cnyist/blog/css/font.css"><meta name="generator" content="Hexo 5.4.0"></head><body><div id="web_bg"></div><div id="sidebar"><div id="menu-mask"></div><div id="sidebar-menus"><div class="author-avatar"><img class="avatar-img" src="https://cdn.jsdelivr.net/gh/cnyist/blog/img/avatar.svg" onerror="onerror=null;src='/img/friend_404.gif'" alt="avatar"/></div><div class="site-data"><div class="data-item is-center"><div class="data-item-link"><a href="/archives/"><div class="headline">文章</div><div class="length-num">97</div></a></div></div><div class="data-item is-center"><div class="data-item-link"><a href="/tags/"><div class="headline">标签</div><div class="length-num">40</div></a></div></div><div class="data-item is-center"><div class="data-item-link"><a href="/categories/"><div class="headline">分类</div><div class="length-num">27</div></a></div></div></div><hr/><div class="menus_items"><div class="menus_item"><a class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> 主页</span></a></div><div class="menus_item"><a class="site-page" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> 时间轴</span></a></div><div class="menus_item"><a class="site-page" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> 标签</span></a></div><div class="menus_item"><a class="site-page" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> 分类</span></a></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> 友链</span></a></div><div class="menus_item"><a class="site-page" href="/chat/"><i class="fa-fw fas fa-comments"></i><span> 闲聊室</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-user"></i><span> about</span></a></div><div class="menus_item"><a class="site-page" href="javascript:void(0);"><i class="fa-fw iconfont icon-Web"></i><span> 镜像站点</span><i class="fas fa-chevron-down 
expand"></i></a><ul class="menus_item_child"><li><a class="site-page" target="_blank" rel="noopener" href="https://yunist.gitee.io"><i class="fa-fw iconfont icon-gitee-fill-round"></i><span> Gitee Pages</span></a></li></ul></div></div></div></div><div class="post" id="body-wrap"><header class="post-bg" id="page-header" style="background-image: url('/ML/watermelon_book/read/16/16.jpg')"><nav id="nav"><span id="blog_name"><a id="site-name" href="/">云玩家</a></span><div id="menus"><div id="search-button"><a class="site-page social-icon search"><i class="fas fa-search fa-fw"></i><span> 搜索</span></a></div><div class="menus_items"><div class="menus_item"><a class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> 主页</span></a></div><div class="menus_item"><a class="site-page" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> 时间轴</span></a></div><div class="menus_item"><a class="site-page" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> 标签</span></a></div><div class="menus_item"><a class="site-page" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> 分类</span></a></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> 友链</span></a></div><div class="menus_item"><a class="site-page" href="/chat/"><i class="fa-fw fas fa-comments"></i><span> 闲聊室</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-user"></i><span> about</span></a></div><div class="menus_item"><a class="site-page" href="javascript:void(0);"><i class="fa-fw iconfont icon-Web"></i><span> 镜像站点</span><i class="fas fa-chevron-down expand"></i></a><ul class="menus_item_child"><li><a class="site-page" target="_blank" rel="noopener" href="https://yunist.gitee.io"><i class="fa-fw iconfont icon-gitee-fill-round"></i><span> Gitee Pages</span></a></li></ul></div></div><div id="toggle-menu"><a class="site-page"><i class="fas fa-bars fa-fw"></i></a></div></div></nav><div id="post-info"><h1 
class="post-title">读懂西瓜书 16 : 强化学习</h1><div id="post-meta"><div class="meta-firstline"><span class="post-meta-date"><i class="far fa-calendar-alt fa-fw post-meta-icon"></i><span class="post-meta-label">发表于</span><time class="post-meta-date-created" datetime="2020-04-25T09:06:40.000Z" title="发表于 2020-04-25 17:06:40">2020-04-25</time><span class="post-meta-separator">|</span><i class="fas fa-history fa-fw post-meta-icon"></i><span class="post-meta-label">更新于</span><time class="post-meta-date-updated" datetime="2021-08-14T04:39:34.341Z" title="更新于 2021-08-14 12:39:34">2021-08-14</time></span><span class="post-meta-categories"><span class="post-meta-separator">|</span><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/">机器学习</a><i class="fas fa-angle-right post-meta-separator"></i><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/">强化学习</a><i class="fas fa-angle-right post-meta-separator"></i><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E8%A5%BF%E7%93%9C%E4%B9%A6/">西瓜书</a><i class="fas fa-angle-right post-meta-separator"></i><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/%E5%85%A5%E9%97%A8/">入门</a><i class="fas fa-angle-right post-meta-separator"></i><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E8%A5%BF%E7%93%9C%E4%B9%A6/%E8%AF%BB%E6%87%82%E8%A5%BF%E7%93%9C%E4%B9%A6/">读懂西瓜书</a></span></div><div class="meta-secondline"><span class="post-meta-separator">|</span><span class="post-meta-pv-cv"><i class="far fa-eye fa-fw post-meta-icon"></i><span 
class="post-meta-label">阅读量:</span><span id="busuanzi_value_page_pv"></span></span><span class="post-meta-separator">|</span><span class="post-meta-commentcount"><i class="far fa-comments fa-fw post-meta-icon"></i><span class="post-meta-label">评论数:</span><a href="/ML/watermelon_book/read/16/#post-comment"><span id="twikoo-count"></span></a></span></div></div></div></header><main class="layout" id="content-inner"><div id="post"><article class="post-content" id="article-container"><h1 id="K-摇臂赌博机"><a href="#K-摇臂赌博机" class="headerlink" title="K-摇臂赌博机"></a>K-摇臂赌博机</h1><h2 id="探索与利用"><a href="#探索与利用" class="headerlink" title="探索与利用"></a>探索与利用</h2><p>探索与利用总是矛盾的, 要使奖赏最大, 就要在探索与利用之间做好权衡.</p>
<h2 id="ϵ-贪心"><a href="#ϵ-贪心" class="headerlink" title="ϵ-贪心"></a>ϵ-贪心</h2><p>以 $\epsilon$ 的概率进行探索, 以 $1-\epsilon$ 的概率进行利用.</p>
<h3 id="增量式计算"><a href="#增量式计算" class="headerlink" title="增量式计算"></a>增量式计算</h3><p>令 $Q(k)$ 记录摇臂 $k$ 的平均奖赏. 若摇臂 $k$ 被尝试了 $n$ 次, 得到的奖赏是 $v_1,v_2,\dots,v_n$ , 则平均奖赏为<br>$$<br>Q(k)=\frac{1}{n}\sum_{i=1}^nv_i<br>$$<br>由于如果这样计算的话, 需要记录 $n$ 个奖赏值, 算法效率也不高 ( $\mathcal O(n)$ ) , 所以采用增量式计算.<br>$$<br>\begin{aligned}<br>Q_n(k)&amp;=\frac{1}{n}\big((n-1)\times Q_{n-1}(k)+v_n\big)\\<br>&amp;=Q_{n-1}(k)+\frac{1}{n}\big(v_n-Q_{n-1}(k)\big)<br>\end{aligned}<br>$$<br>这样就只需要记录两个值: 已尝试次数 $n-1$ 以及最近平均奖赏 $Q_{n-1}(k)$ . 算法复杂度也下降到 $\mathcal O(1)$ .</p>
<h2 id="Softmax"><a href="#Softmax" class="headerlink" title="Softmax"></a>Softmax</h2><p> &nbsp;$\mathrm{Softmax}$ 可以看做是高维的 $\mathrm{Sigmoid}$ 函数. 使用 $\mathrm{Softmax}$ 可以让那些奖赏更高的摇臂有更高的被选取概率.<br>$$<br>p(k) = \frac{\mathrm{e}^{\frac{Q(k)}{\tau}}}{\sum_{i=1}^K\mathrm{e}^{\frac{Q(i)}{\tau}}}<br>$$<br>其中 $\tau&gt;0$ 称为 “温度” . 温度趋近于 $0$ , 将倾向于 “仅利用” , 温度趋近于无穷大, 将倾向于 “仅探索” . 我们假设 $k'$ 是奖赏最高的那个摇臂, 那么有<br>$$<br>\begin{aligned}<br>\lim_{\tau\to 0^+}p(k')=\lim_{\tau\to 0^+}\frac{\mathrm{e}^{\frac{Q(k')}{\tau}}}{\sum_{i=1}^K\mathrm{e}^{\frac{Q(i)}{\tau}}}&amp;=\lim_{\tau\to 0^+}\frac{1}{1+\sum_{i=1,i\neq k'}^K\frac{\mathrm{e}^{Q(i)/\tau}}{\mathrm{e}^{Q(k')/\tau}}}<br>\end{aligned}<br>$$<br>注意到当 $i\neq k'$ 时 $Q(i)&lt;Q(k')$ , 指数 $\big(Q(i)-Q(k')\big)/\tau\to-\infty$ , 所以求和的每一项都趋近于 $0$ , 于是有<br>$$<br>\begin{aligned}<br>\lim_{\tau\to 0^+}p(k')=\lim_{\tau\to 0^+}\frac{\mathrm{e}^{\frac{Q(k')}{\tau}}}{\sum_{i=1}^K\mathrm{e}^{\frac{Q(i)}{\tau}}}&amp;=\lim_{\tau\to 0^+}\frac{1}{1+\sum_{i=1,i\neq k'}^K\frac{\mathrm{e}^{Q(i)/\tau}}{\mathrm{e}^{Q(k')/\tau}}}\\<br>&amp;=\frac{1}{1+0}\\<br>&amp;=1<br>\end{aligned}<br>$$<br>温度趋近于无穷大则是相反的一个过程: 此时每个指数项都趋近于 $1$ , 于是 $p(k)\to\frac{1}{K}$ , 即均匀随机地选取摇臂.</p>
<h1 id="有模型学习"><a href="#有模型学习" class="headerlink" title="有模型学习"></a>有模型学习</h1><h2 id="策略评估"><a href="#策略评估" class="headerlink" title="策略评估"></a>策略评估</h2><h3 id="式-16-7-与-16-8"><a href="#式-16-7-与-16-8" class="headerlink" title="式 (16.7) 与 (16.8)"></a>式 (16.7) 与 (16.8)</h3><p>对于这两个式子, 注意到<br>$$<br>\sum_{a\in A}\pi(x,a)\sum_{x'\in X}P_{x\to x'}^a<br>$$<br>计算了下一步到状态 $x'$ 的概率. 而<br>$$<br>\sum_{a\in A}\pi(x,a)\sum_{x'\in X}\left(P_{x\to x'}^a\frac{1}{T}R_{x\to x'}^a\right)<br>$$<br>则计算了假如下一步转移到状态 $x'$ , 这一步的期望奖赏. (这里用的是 $T$ 步累积奖赏, $\gamma$ 折扣累积奖赏也是类似的) .</p>
<h2 id="策略改进"><a href="#策略改进" class="headerlink" title="策略改进"></a>策略改进</h2><p>原来的策略评估只是将所有可能的动作产生的期望以概率求和, 那么为什么不直接选择最好的那个动作呢? 这就叫做策略改进. 由于每一步改进都会变得更好, 于是到最后不能变得再好的时候 (也就是收敛) 时, 就达到了最优策略.</p>
<h2 id="策略迭代与值迭代"><a href="#策略迭代与值迭代" class="headerlink" title="策略迭代与值迭代"></a>策略迭代与值迭代</h2><p>第一个算法是将两个部分分开, 效率较低, 而第二个算法同时进行两个部分, 效率较高.</p>
<h1 id="免模型学习"><a href="#免模型学习" class="headerlink" title="免模型学习"></a>免模型学习</h1><p>没有模型, 我们就不能方便地计算出每一步的期望了, 因此就要近似地计算.</p>
<h2 id="蒙特卡罗强化学习"><a href="#蒙特卡罗强化学习" class="headerlink" title="蒙特卡罗强化学习"></a>蒙特卡罗强化学习</h2><p>通过蒙特卡罗随机游走, 采样出 $n$ 条轨迹, 为了保证每条轨迹都不同, 因此采用了 $\epsilon$-贪心策略, 即以 $1-\epsilon$ 的概率选取确定性的策略 $\pi$ (在这里也可以等价为最优动作) 来作为当前动作, 以 $\epsilon$ 的概率选取随机动作 (同样包括策略 $\pi$ ) . 因此策略 $\pi$ 被选取到的概率是 $1-\epsilon+\frac{\epsilon}{|A|}$ , $A$ 为动作集合.</p>
<h3 id="同策略"><a href="#同策略" class="headerlink" title="同策略"></a>同策略</h3><p>同策略即评估和改进的是同一个策略, 即 $\epsilon$-贪心策略. 但是 $\epsilon$-贪心策略本来是帮助策略评估 (走出不同轨迹) 的, 而不是为了最终使用, 于是就有了异策略.</p>
<h3 id="异策略"><a href="#异策略" class="headerlink" title="异策略"></a>异策略</h3><p>异策略采用了两种策略, 一种用来评估, 一种用来改进. 其实现原理与拒绝-接受采样差不多.</p>
<h2 id="时序差分学习"><a href="#时序差分学习" class="headerlink" title="时序差分学习"></a>时序差分学习</h2><p>由于蒙特卡罗强化学习需要对策略评估后才能做出改进, 而前面的策略迭代与值迭代算法的更新是实时的, 于是蒙特卡罗强化学习方法就显得效率太低, 这是由于没有充分利用强化学习的 $\mathrm{MDP}$ 结构. $\mathrm{MDP}$ 结构的一大特点就是无后效性, 但是蒙特卡罗强化学习的 “全部评估-全部学习” 方法没有很好的利用这一点.</p>
<p>时序差分学习与蒙特卡罗强化学习的差别就在于, 蒙特卡罗强化学习在求平均 (近似期望) 的时候是 “批处理式” 的, 而时序差分学习是 “增量式” 的.<br>$$<br>Q_{t+1}^\pi(x,a)=Q_{t}^\pi(x,a)+\frac{1}{t+1}\big(r_{t+1}-Q_{t}^\pi(x,a)\big)<br>$$<br>通常将 $\frac{1}{t+1}$ 替换为系数 $\alpha_{t+1}$ , 在实践中通常令 $\alpha_{t+1}$ 为一个较小的正数值 $\alpha$ . 当然, 这样就不是求平均了, 但是将 $Q_{t}^\pi(x,a)$ 展开, 你会发现系数之和还是 $1$ , 只不过越往后系数越大, 也就是越靠后积累的奖赏越重要, 随着 $\alpha$ 不断增大, 这一特点也在不断扩大. <strong>总而言之, 时序差分学习与蒙特卡罗强化学习的区别就类似于策略迭代与值迭代的两个算法之间的区别.</strong></p>
<p>时序差分学习同样分为同策略与异策略, 与蒙特卡罗强化学习类似.</p>
<h1 id="值函数近似"><a href="#值函数近似" class="headerlink" title="值函数近似"></a>值函数近似</h1><p>值函数近似的过程其实就是回归. 用 $\boldsymbol{\theta}^{\mathrm{T}}\boldsymbol{x}$ 和 $\boldsymbol{\theta}^{\mathrm{T}}(\boldsymbol{x},a)$ 来近似 $V^\pi(\boldsymbol{x})$ 和 $Q^\pi(\boldsymbol{x},a)$ , 这样就能解决连续空间有无穷多状态的问题. 至于回归用的算法, 可以自己决定.</p>
<h1 id="模仿学习"><a href="#模仿学习" class="headerlink" title="模仿学习"></a>模仿学习</h1><h2 id="直接模仿学习"><a href="#直接模仿学习" class="headerlink" title="直接模仿学习"></a>直接模仿学习</h2><p>可以说就是 “预测” 人类专家的动作. 对于离散的动作, 使用的是分类, 对于连续的动作, 使用的是回归. 利用这样的方式学得人类专家的决策, 然后再通过强化学习进一步改进. 也就是 “监督学习” + “强化学习” = “直接模仿学习” .</p>
<h2 id="逆强化学习"><a href="#逆强化学习" class="headerlink" title="逆强化学习"></a>逆强化学习</h2><p>直接模仿学习是 “直接” 学习现有的范例, 然后根据已有的奖赏函数继续学习. 而逆强化学习的 “逆” 就体现在, 它根据现有的范例, <strong>反推出</strong>奖赏函数, 然后利用反推出的奖赏函数学习. 而反推出的奖赏函数要符合什么性质呢, 就是要保证在此奖赏函数下, 范例就是最优策略. 由于所有策略几乎不可能都尝试一遍, 因此往往采用迭代的方式, 先随机生成一个策略然后求取奖赏函数, 然后根据奖赏函数再求取策略, 如此反复直到策略和奖赏函数都符合范例.</p>
</article><div class="post-copyright"><div class="post-copyright__author"><span class="post-copyright-meta">文章作者: </span><span class="post-copyright-info"><a href="mailto:yunist@qq.com">云玩家</a></span></div><div class="post-copyright__type"><span class="post-copyright-meta">文章链接: </span><span class="post-copyright-info"><a href="http://yunist.cn/ML/watermelon_book/read/16/">http://yunist.cn/ML/watermelon_book/read/16/</a></span></div><div class="post-copyright__notice"><span class="post-copyright-meta">版权声明: </span><span class="post-copyright-info">本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" target="_blank">CC BY-NC-SA 4.0</a> 许可协议。转载请注明来自 <a href="http://yunist.cn" target="_blank">云玩家</a>！</span></div></div><div class="tag_share"><div class="post-meta__tag-list"><a class="post-meta__tags" href="/tags/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/">机器学习</a><a class="post-meta__tags" href="/tags/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/">强化学习</a><a class="post-meta__tags" href="/tags/%E8%A5%BF%E7%93%9C%E4%B9%A6/">西瓜书</a></div><div class="post_share"><div class="social-share" data-image="/ML/watermelon_book/read/16/16.jpg" data-sites="facebook,twitter,wechat,weibo,qq"></div><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/social-share.js/dist/css/share.min.css" media="print" onload="this.media='all'"><script src="https://cdn.jsdelivr.net/npm/social-share.js/dist/js/social-share.min.js" defer></script></div></div><nav class="pagination-post" id="pagination"><div class="prev-post pull-left"><!-- - var pagination_cover = prev.cover === false ? prev.randomcover : prev.cover--><a href="/ML/RL/primer/RL_demo/"><img class="prev-cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/primer/RL_demo/RL_demo.png" onerror="onerror=null;src='/img/404.jpg'" alt="cover of previous post"><div class="pagination-info"><div class="label">上一篇</div><div class="prev_info">强化学习入门 Demo</div></div></a></div><!-- - var pagination_cover = next.cover == false ? 
next.randomcover : next.cover--><div class="next-post pull-right"><a href="/scribble/obsessive_LaTeX/"><img class="next-cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/scribble/obsessive_LaTeX/obsessive_LaTeX.jpg" onerror="onerror=null;src='/img/404.jpg'" alt="cover of next post"><div class="pagination-info"><div class="label">下一篇</div><div class="next_info">强迫症的 LaTeX</div></div></a></div></nav><div class="relatedPosts"><div class="headline"><i class="fas fa-thumbs-up fa-fw"></i><span> 相关推荐</span></div><div class="relatedPosts-list"><div><a href="/ML/RL/papers_read/RL_papers_word/" title="强化学习类论文常用表达"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/papers_read/RL_papers_word/RL_papers_word.jpg" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2020-09-13</div><div class="title">强化学习类论文常用表达</div></div></a></div><div><a href="/ML/RL/primer/Intro_to_Policy_Optimization_code/" title="Intro to Policy Optimization 代码详解"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/primer/Intro_to_Policy_Optimization_code/Intro_to_Policy_Optimization_code.png" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2020-06-25</div><div class="title">Intro to Policy Optimization 代码详解</div></div></a></div><div><a href="/ML/RL/primer/RL_demo/" title="强化学习入门 Demo"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/primer/RL_demo/RL_demo.png" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2020-04-27</div><div class="title">强化学习入门 Demo</div></div></a></div><div><a href="/ML/RL/primer/algorithm_classify/" title="RL 算法分类"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/primer/algorithm_classify/1.svg" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2020-05-23</div><div class="title">RL 
算法分类</div></div></a></div><div><a href="/ML/RL/primer/basic_concepts/" title="RL 基本概念"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/primer/basic_concepts/basic_concepts.jpg" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2020-05-03</div><div class="title">RL 基本概念</div></div></a></div><div><a href="/ML/RL/reinforcement_learning/1/" title="《强化学习》(第 2 版) 习题 1"><img class="cover" src="https://cdn.jsdelivr.net/gh/cnyist/blog/ML/RL/reinforcement_learning/1/RL.png" alt="cover"><div class="content is-center"><div class="date"><i class="far fa-calendar-alt fa-fw"></i> 2021-06-17</div><div class="title">《强化学习》(第 2 版) 习题 1</div></div></a></div></div></div><hr/><div id="post-comment"><div class="comment-head"><div class="comment-headline"><i class="fas fa-comments fa-fw"></i><span> 评论</span></div></div><div class="comment-wrap"><div><div id="twikoo-wrap"></div></div></div></div></div><div class="aside-content" id="aside-content"><div class="card-widget card-info"><div class="card-info-avatar is-center"><img class="avatar-img" src="https://cdn.jsdelivr.net/gh/cnyist/blog/img/avatar.svg" onerror="this.onerror=null;this.src='/img/friend_404.gif'" alt="avatar"/><div class="author-info__name">云玩家</div><div class="author-info__description">云玩家's blog</div></div><div class="card-info-data"><div class="card-info-data-item is-center"><a href="/archives/"><div class="headline">文章</div><div class="length-num">97</div></a></div><div class="card-info-data-item is-center"><a href="/tags/"><div class="headline">标签</div><div class="length-num">40</div></a></div><div class="card-info-data-item is-center"><a href="/categories/"><div class="headline">分类</div><div class="length-num">27</div></a></div></div><a class="button--animated" id="card-info-btn" target="_blank" rel="noopener" href="https://github.com/xxxxxx"><i class="fab fa-github"></i><span>Follow Me</span></a><div class="card-info-social-icons is-center"><a 
class="social-icon" href="https://github.com/cnyist" target="_blank" title="Github"><i class="fab fa-github"></i></a><a class="social-icon" href="https://blog.csdn.net/chnyist" target="_blank" title="CSDN"><i class="fas iconfont icon-CN_csdnnet"></i></a><a class="social-icon" href="https://www.zhihu.com/people/cnyist" target="_blank" title="知乎"><i class="fas iconfont icon-zhihu"></i></a><a class="social-icon" href="mailto:yunist@qq.com" target="_blank" title="Email"><i class="fas fa-envelope"></i></a></div></div><div class="card-widget card-announcement"><div class="item-headline"><i class="fas fa-bullhorn card-announcement-animation"></i><span>公告</span></div><div class="announcement_content">This is my Blog</div></div><div class="sticky_layout"><div class="card-widget" id="card-toc"><div class="item-headline"><i class="fas fa-stream"></i><span>目录</span></div><div class="toc-content"><ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#K-%E6%91%87%E8%87%82%E8%B5%8C%E5%8D%9A%E6%9C%BA"><span class="toc-number">1.</span> <span class="toc-text">K-摇臂赌博机</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E6%8E%A2%E7%B4%A2%E4%B8%8E%E5%88%A9%E7%94%A8"><span class="toc-number">1.1.</span> <span class="toc-text">探索与利用</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%CF%B5-%E8%B4%AA%E5%BF%83"><span class="toc-number">1.2.</span> <span class="toc-text">ϵ-贪心</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%A2%9E%E9%87%8F%E5%BC%8F%E8%AE%A1%E7%AE%97"><span class="toc-number">1.2.1.</span> <span class="toc-text">增量式计算</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#Softmax"><span class="toc-number">1.3.</span> <span class="toc-text">Softmax</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%9C%89%E6%A8%A1%E5%9E%8B%E5%AD%A6%E4%B9%A0"><span class="toc-number">2.</span> 
<span class="toc-text">有模型学习</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E7%AD%96%E7%95%A5%E8%AF%84%E4%BC%B0"><span class="toc-number">2.1.</span> <span class="toc-text">策略评估</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%BC%8F-16-7-%E4%B8%8E-16-8"><span class="toc-number">2.1.1.</span> <span class="toc-text">式 (16.7) 与 (16.8)</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E7%AD%96%E7%95%A5%E6%94%B9%E8%BF%9B"><span class="toc-number">2.2.</span> <span class="toc-text">策略改进</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E7%AD%96%E7%95%A5%E8%BF%AD%E4%BB%A3%E4%B8%8E%E5%80%BC%E8%BF%AD%E4%BB%A3"><span class="toc-number">2.3.</span> <span class="toc-text">策略迭代与值迭代</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E5%85%8D%E6%A8%A1%E5%9E%8B%E5%AD%A6%E4%B9%A0"><span class="toc-number">3.</span> <span class="toc-text">免模型学习</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E8%92%99%E7%89%B9%E5%8D%A1%E7%BD%97%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0"><span class="toc-number">3.1.</span> <span class="toc-text">蒙特卡罗强化学习</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%90%8C%E7%AD%96%E7%95%A5"><span class="toc-number">3.1.1.</span> <span class="toc-text">同策略</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%BC%82%E7%AD%96%E7%95%A5"><span class="toc-number">3.1.2.</span> <span class="toc-text">异策略</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E6%97%B6%E5%BA%8F%E5%B7%AE%E5%88%86%E5%AD%A6%E4%B9%A0"><span class="toc-number">3.2.</span> <span class="toc-text">时序差分学习</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E5%80%BC%E5%87%BD%E6%95%B0%E8%BF%91%E4%BC%BC"><span class="toc-number">4.</span> 
<span class="toc-text">值函数近似</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%A8%A1%E4%BB%BF%E5%AD%A6%E4%B9%A0"><span class="toc-number">5.</span> <span class="toc-text">模仿学习</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E7%9B%B4%E6%8E%A5%E6%A8%A1%E4%BB%BF%E5%AD%A6%E4%B9%A0"><span class="toc-number">5.1.</span> <span class="toc-text">直接模仿学习</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E9%80%86%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0"><span class="toc-number">5.2.</span> <span class="toc-text">逆强化学习</span></a></li></ol></li></ol></div></div><div class="card-widget card-recent-post"><div class="item-headline"><i class="fas fa-history"></i><span>最新文章</span></div><div class="aside-list"><!-- - let post_cover = article.cover--><div class="aside-list-item"><a class="thumbnail" href="/math/set_theory/unique_ordinal/" title="序数的唯一性"><img src="https://cdn.jsdelivr.net/gh/cnyist/blog/math/set_theory/unique_ordinal/unique_ordinal.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="序数的唯一性"/></a><div class="content"><a class="title" href="/math/set_theory/unique_ordinal/" title="序数的唯一性">序数的唯一性</a><time datetime="2022-01-12T08:26:12.000Z" title="发表于 2022-01-12 16:26:12">2022-01-12</time></div></div><!-- - let post_cover = article.cover--><div class="aside-list-item"><a class="thumbnail" href="/math/mathematical_analysis/mathematical_analysis_practice/3/3-1/" title="(史济怀) 数学分析教程上册第 3 版-练习题 3.1"><img src="https://cdn.jsdelivr.net/gh/cnyist/blog/math/mathematical_analysis/mathematical_analysis_practice/3/3-1/math.jpeg" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="(史济怀) 数学分析教程上册第 3 版-练习题 3.1"/></a><div class="content"><a class="title" href="/math/mathematical_analysis/mathematical_analysis_practice/3/3-1/" title="(史济怀) 数学分析教程上册第 3 版-练习题 3.1">(史济怀) 数学分析教程上册第 3 版-练习题 3.1</a><time datetime="2021-10-19T00:30:55.000Z" title="发表于 2021-10-19 
08:30:55">2021-10-19</time></div></div><!-- - let post_cover = article.cover--><div class="aside-list-item"><a class="thumbnail" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-11/" title="(史济怀) 数学分析教程上册第 3 版-练习题 2.11"><img src="https://cdn.jsdelivr.net/gh/cnyist/blog/math/mathematical_analysis/mathematical_analysis_practice/2/2-11/2-11.jpeg" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="(史济怀) 数学分析教程上册第 3 版-练习题 2.11"/></a><div class="content"><a class="title" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-11/" title="(史济怀) 数学分析教程上册第 3 版-练习题 2.11">(史济怀) 数学分析教程上册第 3 版-练习题 2.11</a><time datetime="2021-09-25T03:37:28.000Z" title="发表于 2021-09-25 11:37:28">2021-09-25</time></div></div><!-- - let post_cover = article.cover--><div class="aside-list-item"><a class="thumbnail" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-10/" title="(史济怀) 数学分析教程上册第 3 版-练习题 2.10"><img src="https://cdn.jsdelivr.net/gh/cnyist/blog/math/mathematical_analysis/mathematical_analysis_practice/2/2-10/2-10.jpeg" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="(史济怀) 数学分析教程上册第 3 版-练习题 2.10"/></a><div class="content"><a class="title" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-10/" title="(史济怀) 数学分析教程上册第 3 版-练习题 2.10">(史济怀) 数学分析教程上册第 3 版-练习题 2.10</a><time datetime="2021-09-22T07:36:49.000Z" title="发表于 2021-09-22 15:36:49">2021-09-22</time></div></div><!-- - let post_cover = article.cover--><div class="aside-list-item"><a class="thumbnail" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-9/" title="(史济怀) 数学分析教程上册第 3 版-练习题 2.9"><img src="https://cdn.jsdelivr.net/gh/cnyist/blog/math/mathematical_analysis/mathematical_analysis_practice/2/2-9/2-9.jpeg" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="(史济怀) 数学分析教程上册第 3 版-练习题 2.9"/></a><div class="content"><a class="title" href="/math/mathematical_analysis/mathematical_analysis_practice/2/2-9/" title="(史济怀) 数学分析教程上册第 3 
版-练习题 2.9">(史济怀) 数学分析教程上册第 3 版-练习题 2.9</a><time datetime="2021-09-20T08:19:06.000Z" title="发表于 2021-09-20 16:19:06">2021-09-20</time></div></div></div></div></div></div></main><footer id="footer"><div id="footer-wrap"><div class="copyright">&copy;2020 - 2022 By 云玩家</div><div class="framework-info"><span>框架 </span><a target="_blank" rel="noopener" href="https://hexo.io">Hexo</a><span class="footer-separator">|</span><span>主题 </span><a target="_blank" rel="noopener" href="https://github.com/jerryc127/hexo-theme-butterfly">Butterfly</a></div></div></footer><script>document.getElementById('web_bg').style = 'background-image: url("' + get_rand_jsd_pic('cnyist', 'banner@master', '/', 70, '-min.jpg') + '")'</script></div><div id="rightside"><div id="rightside-config-hide"><button id="readmode" type="button" title="阅读模式"><i class="fas fa-book-open"></i></button><button id="darkmode" type="button" title="浅色和深色模式转换"><i class="fas fa-adjust"></i></button><button id="hide-aside-btn" type="button" title="单栏和双栏切换"><i class="fas fa-arrows-alt-h"></i></button></div><div id="rightside-config-show"><button id="rightside_config" type="button" title="设置"><i class="fas fa-cog fa-spin"></i></button><button class="close" id="mobile-toc-button" type="button" title="目录"><i class="fas fa-list-ul"></i></button><a id="to_comment" href="#post-comment" title="直达评论"><i class="fas fa-comments"></i></a><button id="go-up" type="button" title="回到顶部"><i class="fas fa-arrow-up"></i></button></div></div><div id="local-search"><div class="search-dialog"><div class="search-dialog__title" id="local-search-title">本地搜索</div><div id="local-input-panel"><div id="local-search-input"><div class="local-search-box"><input class="local-search-box--input" placeholder="搜索文章" type="text"/></div></div></div><hr/><div id="local-search-results"></div><span class="search-close-button"><i class="fas fa-times"></i></span></div><div id="search-mask"></div></div><div><script src="/js/utils.js"></script><script 
src="/js/main.js"></script><script src="/js/search/local-search.js"></script><div class="js-pjax"><script>if (!window.MathJax) {
  // No MathJax object on window yet: define the configuration, then inject
  // the MathJax script so it picks this configuration up.
  window.MathJax = {
    loader: {
      // Alias the mixed-case extension name to the lowercase '[tex]/amscd' path.
      source: {
        '[tex]/amsCd': '[tex]/amscd'
      }
    },
    tex: {
      // Inline math is delimited by $...$ or \(...\).
      inlineMath: [ ['$','$'], ["\\(","\\)"]],
      tags: 'ams'
    },
    options: {
      renderActions: {
        // Registers every script element whose type starts with "math/tex" as
        // a math item. The element is replaced by an empty text node that
        // serves as both the start and end anchor of the item.
        findScript: [10, doc => {
          for (const node of document.querySelectorAll('script[type^="math/tex"]')) {
            // A "; mode=display" suffix in the type marks display math.
            const display = !!node.type.match(/; *mode=display/)
            const math = new doc.options.MathItem(node.textContent, doc.inputJax[0], display)
            const text = document.createTextNode('')
            node.parentNode.replaceChild(text, node)
            math.start = {node: text, delim: '', n: 0}
            math.end = {node: text, delim: '', n: 0}
            doc.math.push(math)
          }
        }, ''],
        // Adds the 'mathjax-overflow' class to the parent of every
        // mjx-container that is not display="true", unless that parent already
        // carries 'has-jax'. The :not(...) must be closed for the selector to
        // be well-formed.
        addClass: [200,() => {
          document.querySelectorAll('mjx-container:not([display=\'true\'])').forEach( node => {
            const target = node.parentNode
            if (!target.classList.contains('has-jax')) {
              target.classList.add('mathjax-overflow')
            }
          })
        }, '', false]
      }
    }
  }

  const script = document.createElement('script')
  script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js'
  script.id = 'MathJax-script'
  script.async = true
  document.head.appendChild(script)
} else {
  // window.MathJax already exists (this script ran before): reset the
  // document and TeX state, then typeset the page again.
  MathJax.startup.document.state(0)
  MathJax.texReset()
  MathJax.typeset()
}</script><script>(()=>{
  // Element that shows this page's comment count; null if the page has none.
  const $countDom = document.getElementById('twikoo-count')

  // Mounts the Twikoo comment widget into #twikoo-wrap.
  const init = () => {
    twikoo.init({
      el: '#twikoo-wrap',
      envId: 'yunist-147dfe',
      region: ''
    })
  }

  // Fetches the comment count (replies excluded) for the current path and
  // writes it into $countDom. Callers must ensure $countDom is not null.
  const getCount = () => {
    twikoo.getCommentsCount({
      envId: 'yunist-147dfe',
      region: '',
      urls: [window.location.pathname],
      includeReply: false
    }).then(function (res) {
      $countDom.innerText = res[0].count
    }).catch(function (err) {
      console.error(err);
    });
  }

  // Initializes the widget and, when requested and a count element exists,
  // schedules the count lookup on the next task.
  const start = withCount => {
    init()
    if (withCount && $countDom) setTimeout(getCount, 0)
  }

  // Starts Twikoo, loading its script from the CDN first when the global
  // `twikoo` object is not available yet.
  //   bool - when true, also fill in the comment count (default: false).
  const loadTwikoo = (bool = false) => {
    if (typeof twikoo === 'object') {
      start(bool)
      return
    }

    getScript('https://cdn.jsdelivr.net/npm/twikoo/dist/twikoo.all.min.js')
      .then(() => start(bool))
      // Log a failed script load (or a failure while starting) instead of
      // leaving an unhandled promise rejection, matching getCount's handling.
      .catch(err => {
        console.error(err)
      })
  }

  // The constant conditions previously wrapped around this call
  // (`'Twikoo' === 'Twikoo' || !false`, then `if (false) ... else`) always
  // selected this path, so the unreachable alternatives are dropped.
  loadTwikoo(true)
})()</script></div><script async data-pjax src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script></div></body></html>